class CLONE:
    """Text-to-waveform model composed of encoder, predictor, flow and decoder parts.

    The class wires together the sub-modules created in ``__init__`` and
    exposes one training pass plus three inference paths that differ only in
    where the prosody latent ``z`` comes from:

    * ``infer_end2end_mode`` - ``z`` from the prosody predictor,
    * ``infer_reconstruction`` - ``z`` from the posterior encoder,
    * ``inference_sample_from_flow`` - ``z`` from the inverse flow.

    Helper callables such as ``get_alignment_matrix``, ``l1_loss``,
    ``concatenate`` and the KL-divergence functions are expected to be
    provided by the surrounding module.
    """

    def __init__(self):
        """Instantiate every sub-module with its default configuration."""
        self.text_encoder = TextEncoder()
        self.prosody_predictor = ProsodyPredictor()
        self.duration_predictor = DurationPredictor()
        self.acoustic_encoder = AcousticEncoder()
        self.wave_decoder = WaveDecoder()
        self.posterior_encoder = PosteriorEncoder()
        self.flow = Flow()
        self.posterior_wave_encoder = PosteriorWaveEncoder()
        self.speaker_embedding = Embedding()
        self.mel_predictor = LinearLayer()

    def training(self, textual_information, durations, linear_spectrogram, mel_spectrogram, speaker_id):
        """Run one training forward pass and return the non-adversarial loss.

        Args:
            textual_information: Input passed to the text encoder.
            durations: Ground-truth durations used to build the alignment
                matrix and as the duration-loss target.
            linear_spectrogram: Input to the posterior encoder and the
                posterior wave encoder.
            mel_spectrogram: Target of the two reconstruction losses.
            speaker_id: Key looked up in the speaker embedding.

        Returns:
            A tuple ``(loss_generator, waveform)`` where ``loss_generator`` is
            ``l_pp + l_dur + l_kl + l_ir + l_recon`` and ``waveform`` is the
            wave decoder output for the concatenated intermediate
            representations.
        """
        # Get speaker embedding
        speaker_embedding = self.speaker_embedding(speaker_id)
        # Text encoder
        text_encoder_output = self.text_encoder(textual_information)
        # Prosody predictor. Its inputs are detached, presumably so that this
        # branch does not back-propagate into the text encoder or the speaker
        # embedding - TODO confirm. The sampled z is not needed in training.
        _, mean_pp, variance_pp = self.prosody_predictor(text_encoder_output.detach(),
                                                         speaker_embedding.detach())
        # Get alignment matrix
        alignment_matrix = get_alignment_matrix(durations)
        # To get linguistic feature, expand text encoder output
        linguistic_feature = text_encoder_output @ alignment_matrix
        # Posterior encoder
        z_q, mean_q, variance_q = self.posterior_encoder(linear_spectrogram,
                                                         linguistic_feature,
                                                         alignment_matrix,
                                                         durations,
                                                         speaker_embedding)
        # Calculate KL divergence of prosody predictor (posterior statistics
        # are detached, so they act as a fixed target here)
        l_pp = KL_divergence_close_form(mean_pp, variance_pp, mean_q.detach(), variance_q.detach())
        # Duration predictor
        durations_hat = self.duration_predictor(text_encoder_output, z_q, speaker_embedding)
        # Calculate duration loss
        l_dur = l1_loss(durations_hat, durations)
        # Flow
        z_p, det = self.flow(z_q)
        # Calculate KL divergence of VAE with normalizing flow
        l_kl = KL_divergence_non_close_form(z_p, det, variance_q)
        # Dual parallel autoencoder
        ir_hat = self.acoustic_encoder(linguistic_feature, z_q, alignment_matrix, speaker_embedding)
        ir = self.posterior_wave_encoder(linear_spectrogram)
        # Calculate loss of intermediate representation. Uses the same free
        # function as the other L1 losses; the class defines no l1_loss method.
        l_ir = l1_loss(ir_hat, ir)
        # Concatenate intermediate representations
        # NOTE(review): batch_dim is not defined in this class; it must come
        # from the enclosing module - confirm.
        ir_concat = concatenate([ir, ir_hat], dim=batch_dim)
        # Wave decoder
        waveform = self.wave_decoder(ir_concat)
        # Convert waveform to mel-spectrogram
        mel_1 = convert_to_melspectrogram(waveform)
        # Predict mel-spectrogram from intermediate representation
        mel_2 = self.mel_predictor(ir_concat)
        # Calculate reconstruction loss
        # NOTE(review): mel_1 / mel_2 come from the concatenated batch, so they
        # presumably hold twice as many items as mel_spectrogram - verify that
        # l1_loss handles this as intended.
        l_recon = l1_loss(mel_1, mel_spectrogram) + l1_loss(mel_2, mel_spectrogram)
        # Loss of generator without adversarial training loss
        loss_generator = l_pp + l_dur + l_kl + l_ir + l_recon
        return loss_generator, waveform

    # End-to-end inference mode
    def infer_end2end_mode(self, textual_information, speaker_id):
        """Synthesize a waveform using the prosody predictor's latent.

        Args:
            textual_information: Input passed to the text encoder.
            speaker_id: Key looked up in the speaker embedding.

        Returns:
            The wave decoder output.
        """
        speaker_embedding = self.speaker_embedding(speaker_id)
        text_encoder_output = self.text_encoder(textual_information)
        # Get z of prosody from prosody predictor
        z, _, _ = self.prosody_predictor(text_encoder_output, speaker_embedding)
        durations_hat = self.duration_predictor(text_encoder_output, z, speaker_embedding)
        alignment_matrix = get_alignment_matrix(durations_hat)
        linguistic_feature = text_encoder_output @ alignment_matrix
        ir_hat = self.acoustic_encoder(linguistic_feature, z, alignment_matrix, speaker_embedding)
        waveform = self.wave_decoder(ir_hat)
        return waveform

    # Posterior encoder mode
    def infer_reconstruction(self, textual_information, durations, linear_spectrogram, speaker_id):
        """Synthesize a waveform using the posterior encoder's latent.

        Unlike the other inference paths, the supplied ``durations`` are used
        for the alignment; the duration predictor is not called.

        Args:
            textual_information: Input passed to the text encoder.
            durations: Durations used to build the alignment matrix.
            linear_spectrogram: Input to the posterior encoder.
            speaker_id: Key looked up in the speaker embedding.

        Returns:
            The wave decoder output.
        """
        speaker_embedding = self.speaker_embedding(speaker_id)
        text_encoder_output = self.text_encoder(textual_information)
        alignment_matrix = get_alignment_matrix(durations)
        linguistic_feature = text_encoder_output @ alignment_matrix
        # Get z of prosody from posterior encoder (its statistics are unused)
        z_q, _, _ = self.posterior_encoder(linear_spectrogram,
                                           linguistic_feature,
                                           alignment_matrix,
                                           durations,
                                           speaker_embedding)
        ir_hat = self.acoustic_encoder(linguistic_feature, z_q, alignment_matrix, speaker_embedding)
        waveform = self.wave_decoder(ir_hat)
        return waveform

    # Flow mode
    def inference_sample_from_flow(self, textual_information, sampling_result, speaker_id):
        """Synthesize a waveform using a latent obtained from the inverse flow.

        Args:
            textual_information: Input passed to the text encoder.
            sampling_result: Sample that is mapped through ``flow.inverse`` to
                obtain the prosody latent.
            speaker_id: Key looked up in the speaker embedding.

        Returns:
            The wave decoder output.
        """
        speaker_embedding = self.speaker_embedding(speaker_id)
        text_encoder_output = self.text_encoder(textual_information)
        # Get z of prosody from flow. This has to happen before the duration
        # predictor, which consumes z_q.
        z_q = self.flow.inverse(sampling_result)
        durations_hat = self.duration_predictor(text_encoder_output, z_q, speaker_embedding)
        alignment_matrix = get_alignment_matrix(durations_hat)
        linguistic_feature = text_encoder_output @ alignment_matrix
        ir_hat = self.acoustic_encoder(linguistic_feature, z_q, alignment_matrix, speaker_embedding)
        waveform = self.wave_decoder(ir_hat)
        return waveform
